/* Session setup: abbreviate the size of the log file and define the project library. */
options
	nodate
	nonumber
	noprintmsglist
	linesize=200
	pagesize=100;

/* Name = member of library loc that is analyzed (also used in the graphics file name).
   Path = folder that holds the input data. */
%let Name = LA ;
%let Path = C:\Users\Mihaiela\Desktop\My files\Publications, manuscripts in progress\Measure of democracy\HCA data;
libname loc "&Path";

/* Read in the data and create a unique identifier (NationYR) for each Nation-Year.
   Keep the country code short (1-3 letters); otherwise, the dendrogram will be
   adversely affected. Make sure that a unique code is assigned to each country. */
data HCA;
	set loc.&Name;
	format NationYR $10.;
	/* Two-digit year: characters 3-4 of Year written with the 4. format. */
	_yy=substr(put(Year,4.),3,2);
	NationYR=cats(Country,_yy);
	/* Row number of the observation; serves as the key when the standardized
	   indices are merged back in. */
	_id=_n_;
	drop _yy;
run;

/* Create min-max standardized versions of the democracy indices and write them,
   together with the row key _id, to the data set HCA_std. */
proc iml;
	use HCA;
	read all var{FH Van Democ Elecsd Polity} into raw;
	read all var{_id} into id;
	close HCA;

	START MAIN;
		nvar=ncol(raw);
		/* Last column (Polity): transform range from (-9,10) to (0,19). */
		raw[,nvar]=raw[,nvar]+9;
		std=j(nrow(raw),nvar,.);
		do k=1 to nvar;
			/* Standardization based on formula 5 of Milligan and Cooper (1988):
			   (value - column minimum) / (column maximum - column minimum). */
			lo=min(raw[,k]);
			hi=max(raw[,k]);
			std[,k]=(raw[,k]-lo)/(hi-lo);
		end;
		result=id||std;
		CREATE HCA_std FROM result [c={_id FH_std Van_std Democ_std Elecsd_std Polity_std}];
		APPEND FROM result;
	FINISH MAIN;
RUN;QUIT;

/* Attach the standardized variables to the original rows. _id is only the merge
   key, so it is left out of the result. */
data HCA(drop=_id);
	merge HCA HCA_std;
	by _id;
run;

/* The standardized values have been merged into HCA; remove the temporary data set. */
proc datasets nolist;
	delete HCA_std;
quit;
	
/* Inspect the variance of each standardized index. Scaling is a problem if the
   variances are very different. */
title "Basic statistics for standardized indices";
proc means
		data=HCA
		mean var min max;
	var FH_std Van_std Democ_std Elecsd_std Polity_std;
run;

/* PROC ACECLUS performs a linear transformation of the standardized indices before
   the cluster analysis. Use it when clusters are elongated and elliptical. The
   output replaces HCA. */
proc aceclus
		data=HCA
		out=HCA
		p=0.03
		noprint;
	var FH_std Van_std Democ_std Elecsd_std Polity_std;
run;

/* Ward's-method cluster analysis of the canonical variables (can:).
   K=10 is the number of nearest neighbors used for the density estimates that TRIM=
   relies on (it is not a minimum cluster size). TRIM=10 trims 10% of the points as
   outliers.
   The procedure needs to be run twice. The first time use TRIM=10. Determine the
   number of clusters and the cluster identity for the non-outliers, and save this
   data set to Excel. The second time run with TRIM=0 and use the result to identify
   the probable clusters for the outliers. Then fill in the missing cluster identities
   for the outliers in the Excel file. This file is then imported and merged with the
   primary analysis file. These steps ensure that the outliers do not change the
   cluster identities of the "good" data (non-outliers). Note: these steps are NOT
   included in the code below. */
ods graphics on;
title "Cluster analysis";
proc cluster data=HCA outtree=Tree
		method=WARD nonorm
		k=10 trim=10
		ccc pseudo print=30;
	id NationYR;
	var can:;
	copy FH Van Democ Elecsd Polity;
	format NationYR $15.;
run;
ods graphics off;

/* RUN TWICE: once for out=Cluster and once for out=Cluster1.
   Cluster=full file, no trimming; Cluster1=with trimming. */

/* Graphics output file (EMF), referenced by GSFNAME= below. */
filename gout "&Path\..\SAS analyses and syntax\&Name..emf";
goptions
	vsize=10in hsize=10in
	htext=2.5pct htitle=3pct ftext='Times New Roman'
	device=EMF gsfmode=replace gsfname=gout
	TRANSPARENCY;
title "Latin America"; * change title to match data set.;
/* Axis for the R-squared scale, 0 to 1 in steps of 0.1. */
axis1 label=(height=3pct 'R-squared') order=(0 to 1 by 0.1);

/* Generate the dendrogram and assign each Nation-Year to a cluster (out=Clusters).
   Note: due to the large number of cases, you will need to use Microsoft Paint to
   make the figure presentable.
   The number of clusters was determined based on examination of the previous output.
   Specifically, we looked at CCC, pseudo-F, pseudo-T squared, and R-squared. This
   examination indicated that the most parsimonious model is a 3-cluster solution. */
ods graphics;
proc tree data=Tree out=Clusters(drop=CLUSNAME)
		nclusters=3
		horizontal haxis=axis1 vpages=1
		similar;
	id NationYR;
	/* R-squared is used as the height variable of the tree. */
	height _rsq_;
	copy FH Van Democ Elecsd Polity;
	where _FREQ_ > -1;
run;
ods graphics off;

/* Order the cluster assignments by cluster (required for the BY-group summary). */
proc sort data=clusters out=clusters;
	by cluster NationYR;
run;
/* -------------------------------------------------------------- */

/* Basic statistics (mean, N, 95% confidence limits) on each cluster. These will
   help interpret the clusters. */
title "Means by each cluster";
proc means data=clusters
		maxdec=2 alpha=0.05
		mean n clm;
	var FH Van Democ Elecsd Polity;
	by cluster;
run;

/* Same statistics for all observations combined, then clear the title. */
title "Means across the three clusters";
proc means data=clusters
		maxdec=2 alpha=0.05
		mean n clm;
	var FH Van Democ Elecsd Polity;
run;
title;

/* Rescale Polity into three categories (Polity2) per guidance provided by the
   original authors: 1 = -6 or below, 2 = above -6 up to 5, 3 = above 5. */
data clusters;
	set clusters;

	/* SAS orders a missing value below every number, so test for it first;
	   otherwise a missing Polity would satisfy Polity<=-6 and be coded as 1. */
	if missing(Polity) then Polity2=.;
		else if Polity<=-6 then Polity2=1;
		else if Polity<=5 then Polity2=2;
		else Polity2=3;
run;

/* Put both data sets in NationYR order for the merge that follows. */
proc sort data=HCA out=HCA;
	by NationYR;
run;

proc sort data=Clusters out=Clusters;
	by NationYR;
run;

/* Create the permanent data set: cluster results joined with Nation and Year from
   HCA, plus a Nation-Year label (ID). The member name is built from the Name macro
   variable (resolves to loc.LA_cl3 when Name=LA) so that it follows the input data
   set instead of being hard-coded. */
data loc.&Name._cl3;
	merge HCA(keep=Nation Year NationYR)
		  Clusters;
	by NationYR;
	ID=cats(Nation,Year);
run;

/* Produce the Nation/Year-by-cluster combinations (data set ClusterID) to more
   easily identify the cluster structure of the data. */
proc freq data=loc.&Name._cl3 noprint;
	table ID*cluster / norow nocol nopercent out=ClusterID(drop=count Percent);
run;
